install.packages("e1071")
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.6/e1071_1.7-3.tgz'
Content type 'application/x-gzip' length 900756 bytes (879 KB)
==================================================
downloaded 879 KB

The downloaded binary packages are in
    /var/folders/w8/ct2m1fx50sj6m94jlql506gc0000gp/T//RtmplFqbG3/downloaded_packages
library(e1071)

Attaching package: ‘e1071’

The following object is masked _by_ ‘.GlobalEnv’:

    sigmoid
library(MASS)
n <- 100
set.seed(2018)
x <- rbind(mvrnorm(n/2, c(-2,-2), diag(2)), mvrnorm(n/2, c(2,2), diag(2)))
y <- as.factor(rep(c(1,2), each=n/2))
dat <- data.frame(x1 = x[,1], x2= x[,2], y)
with(dat, plot(x2, x1, pch=18, col=y, asp=1))

head(x)
           [,1]       [,2]
[1,] -1.4125573 -2.4229840
[2,] -1.4725563 -3.5498782
[3,] -2.5960212 -2.0644293
[4,] -0.7286876 -1.7291186
[5,] -1.7987455 -0.2647163
[6,] -2.4556339 -2.2647112
head(dat)
tail(dat)

We want to fit a support vector classifier to the data above, using a linear kernel with a large cost.

svm1 <- svm(y ~ x1 + x2, data = dat, kernel = "linear", cost = 1000)

summary(svm1)

Call:
svm(formula = y ~ x1 + x2, data = dat, kernel = "linear", cost = 1000)


Parameters:
   SVM-Type:  C-classification 
 SVM-Kernel:  linear 
       cost:  1000 

Number of Support Vectors:  4

 ( 2 2 )


Number of Classes:  2 

Levels: 
 1 2
x[svm1$index,]
            [,1]       [,2]
[1,] -1.12392840  0.9559452
[2,]  0.70176410 -0.4910098
[3,] -0.10162977  0.2994029
[4,] -0.06858902  0.1619788
plot(svm1, dat)
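
For a linear kernel the separating hyperplane can be read off the fitted object. A minimal sketch, assuming a refit with scale = FALSE so that the coefficients live in the original x1/x2 coordinates (svm() scales inputs by default):

svm1u <- svm(y ~ x1 + x2, data = dat, kernel = "linear", cost = 1000, scale = FALSE)
w <- t(svm1u$coefs) %*% svm1u$SV # weight vector: sum of coefficients times support vectors
b <- -svm1u$rho                  # intercept; decision function is w.x + b
with(dat, plot(x1, x2, pch = 18, col = y, asp = 1))
abline(a = -b/w[2], b = -w[1]/w[2])                # boundary w.x + b = 0
abline(a = (1 - b)/w[2], b = -w[1]/w[2], lty = 2)  # margin w.x + b = 1
abline(a = (-1 - b)/w[2], b = -w[1]/w[2], lty = 2) # margin w.x + b = -1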


Now evaluate the fitted classifier on new data.

xnew <- rbind(mvrnorm(n/2, c(-2,-2), diag(2)), mvrnorm(n/2, c(2,2), diag(2)))
ynew <- as.factor(rep(c(1,2), each=n/2))
ypred <- predict(svm1, xnew)
table(ypred, ynew)
     ynew
ypred  1  2
    1 49  0
    2  1 50
sum(ynew != ypred)/n
[1] 0.01

Now consider the case where the variance is large, so that no separating hyperplane exists.

set.seed(2018)
x <- rbind(mvrnorm(n/2, c(-2,-2), 3*diag(2)), mvrnorm(n/2, c(2,2), 3*diag(2)))
y <- as.factor(rep(c(1,2), each=n/2))
dat <- data.frame(x1 = x[,1], x2= x[,2], y)
with(dat, plot(x2, x1, pch=18, col=y, asp=1))

svm2 <- svm(y~x1+x2, data = dat, kernel = "linear", cost = 1e12)

WARNING: reaching max number of iterations
summary(svm2)

Call:
svm(formula = y ~ x1 + x2, data = dat, kernel = "linear", cost = 1e+12)


Parameters:
   SVM-Type:  C-classification 
 SVM-Kernel:  linear 
       cost:  1e+12 

Number of Support Vectors:  11

 ( 5 6 )


Number of Classes:  2 

Levels: 
 1 2
x[svm2$index,]
             [,1]       [,2]
 [1,] -1.65141691  1.0055995
 [2,] -0.89545438 -0.8965873
 [3,]  0.36586275 -1.6458518
 [4,] -6.49893758  0.4430950
 [5,] -3.59851617  0.7802958
 [6,] -1.94633386  0.7485015
 [7,]  0.06828258  1.0112160
 [8,] -0.16203453 -0.1150225
 [9,]  0.24195013 -1.0598300
[10,] -2.10317722  0.2036933
[11,]  0.15982206 -1.4821284
plot(svm2, dat)

xnew <- rbind(mvrnorm(n/2, c(-2,-2), 3*diag(2)), mvrnorm(n/2, c(2,2), 3*diag(2)))
ynew <- as.factor(rep(c(1,2), each=n/2))
ypred <- predict(svm2, xnew)
table(ypred, ynew)
     ynew
ypred  1  2
    1 18 48
    2 32  2
sum(ynew != ypred)/n
[1] 0.8

Many more support vectors are needed here. As the cost tends to infinity, the number of support vectors tends towards the optimal number of support vectors.
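
To check this numerically, a quick sketch on the current data (tot.nSV is the total support-vector count stored in an e1071 svm fit; very large costs may trigger the max-iterations warning seen above):

for (cost in 10^(0:6)) {
  fit <- svm(y ~ x1 + x2, data = dat, kernel = "linear", cost = cost)
  cat("cost =", cost, " support vectors:", fit$tot.nSV, "\n")
}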


Kernel SVM

for(i in seq(1, 1000, by = 100)){
  svm3 <- svm(y~x1+x2, data=dat, kernel = "radial", gamma = 1, cost = i)
  plot(svm3, dat)
  ypred <- predict(svm3, xnew)
  print(sum(ynew != ypred)/n)
}
[1] 0.1
[1] 0.1
[1] 0.11
[1] 0.11
[1] 0.11
[1] 0.11
[1] 0.1
[1] 0.1
[1] 0.1
[1] 0.11

As cost increases, the decision regions become more sharply defined, but the prediction error, if anything, increases slightly.
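
One way to tell sharper regions from genuine improvement is to compare training and test error side by side; a sketch under the same settings as the loop above:

for (i in seq(1, 1000, by = 100)) {
  fit <- svm(y ~ x1 + x2, data = dat, kernel = "radial", gamma = 1, cost = i)
  cat("cost =", i,
      " train:", mean(dat$y != predict(fit, dat)),
      " test:", mean(ynew != predict(fit, xnew)), "\n")
}

If the training error keeps falling while the test error stays flat or rises, the extra cost is being spent on overfitting.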

for(i in seq(1, 100, by=10)){
  svm3 <- svm(y~x1+x2, data=dat, kernel = "radial", gamma = i, cost = 1000)
  plot(svm3, dat)
  ypred <- predict(svm3, xnew)
  print(sum(ynew != ypred)/n)
}
[1] 0.11
[1] 0.13
[1] 0.14
[1] 0.17
[1] 0.21
[1] 0.22
[1] 0.23
[1] 0.23
[1] 0.24
[1] 0.25

The same happens as gamma increases. In both cases, pushing the parameter up leads to overfitting.
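
To confirm the overfitting reading, check the training error directly; with a very large gamma the radial fit should essentially memorise the training set (a sketch with gamma = 100):

fit <- svm(y ~ x1 + x2, data = dat, kernel = "radial", gamma = 100, cost = 1000)
mean(dat$y != predict(fit, dat))  # training error, expected near zero
mean(ynew != predict(fit, xnew))  # test error remains high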

Now use cross-validated tuning to select the optimal parameters.

tune.out <- tune(svm, y~x1+x2, data=dat, kernel="radial",
                 ranges = list(cost = 10^(-3:3), gamma = 10^(-3:3)))
summary(tune.out)

Parameter tuning of ‘svm’:

- sampling method: 10-fold cross validation 

- best parameters:

- best performance: 0.07 

- Detailed performance results:
NA
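
tune() also stores the winning parameters and a refitted model (the best.parameters and best.model components of its return value), so the tuned fit can be used directly:

tune.out$best.parameters
ypred <- predict(tune.out$best.model, xnew)
sum(ynew != ypred)/n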

Fit with the selected parameters and check the prediction error.

svm3 <- svm(y~x1+x2, data=dat, kernel = "radial", gamma = 0.001, cost = 100)
plot(svm3, dat)
ypred <- predict(svm3, xnew)
print(sum(ynew != ypred)/n)

MNIST SVM

filePath <- "https://raw.githubusercontent.com/AJCoca/SLP19/master/"
fileName <- "mnist.csv"
mnist <- read.csv(paste0(filePath, fileName), header = TRUE)
mnist$digit <- as.factor(mnist$digit)

train <- mnist[1:4000,]
constantPix <- apply(train, 2, function(v) all(v == v[1]))
train <- train[, !constantPix] # remove constant (redundant) pixels
test <- mnist[4001:6000, !constantPix]
First fit linear and radial SVMs with default parameters and check the test error.

mnist.svm1 <- svm(digit ~ ., data = train, kernel = "linear")
pred <- predict(mnist.svm1, test)
table(test$digit, pred) # confusion matrix
sum(test$digit != pred) / 2000 # test error
# 0.095

mnist.svm2 <- svm(digit ~ ., data = train, kernel = "radial") # this can be slow
pred <- predict(mnist.svm2, test)
table(test$digit, pred) # confusion matrix
sum(test$digit != pred) / 2000

Carry out SVM tuning to find the optimal cost and gamma parameters.

tune.out <- tune(svm, digit~., data=train, kernel="radial",
                 ranges = list(cost = 10^(-3:3), gamma = 10^(-3:3)))
Variable(s) ‘pixel.51’ and ‘pixel.143’ and ‘pixel.363’ and ‘pixel.391’ and ‘pixel.671’ constant. Cannot scale data.
summary(tune.out)
(1 - pnorm(0.472/0.0676)) * 2 # two-sided normal p-value for z = 0.472/0.0676 ≈ 6.98
[1] 2.90501e-12